PCA по SNP Klebsiella pneumonica.

Нужны эти пакеты

Откроем библиотеки

library("devtools")
library(gdsfmt)
library(SNPRelate)
library(calibrate)

library(vcfR)

У нас было 22 файла после SNP-коллинга, мы их свернули в один, предварительно удалив шапку и заменив название колонки sample1 на соответствующее название штамма.

индексируем

Запихиваем проиндексированные vcf в один файл

Посмотрим что получилось

# the VCF file
vcf.fn <- "/home/daria/Daria/BI/Klebs/vcf/COMBINED.vcf"

snpgdsVCF2GDS(vcf.fn, "CCOMBO.gds")
## VCF Format ==> SNP GDS Format
## Method: exacting biallelic SNPs
## Number of samples: 22
## Parsing "/home/daria/Daria/BI/Klebs/vcf/COMBINED.vcf" ...
##  import 59585 variants.
## + genotype   { Bit2 22x59585, 320.0K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file 'CCOMBO.gds' (445.2K)
##     # of fragments: 49
##     save to 'CCOMBO.gds.tmp'
##     rename 'CCOMBO.gds.tmp' (444.8K, reduced: 348B)
##     # of fragments: 20

Поехали

# open an example dataset (HapMap)
genofile <- snpgdsOpen("/home/daria/Daria/BI/Klebs/CCOMBO.gds") ## open the gate
genofile
## File: /home/daria/Daria/BI/Klebs/CCOMBO.gds (444.8K)
## +    [  ] *
## |--+ sample.id   { Str8 22 LZMA_ra(45.0%), 157B }
## |--+ snp.id   { Int32 59585 LZMA_ra(6.95%), 16.2K }
## |--+ snp.rs.id   { Str8 59585 LZMA_ra(0.26%), 161B }
## |--+ snp.position   { Int32 59585 LZMA_ra(33.7%), 78.4K }
## |--+ snp.chromosome   { Str8 59585 LZMA_ra(0.04%), 309B }
## |--+ snp.allele   { Str8 59585 LZMA_ra(11.7%), 27.3K }
## |--+ genotype   { Bit2 22x59585, 320.0K } *
## \--+ snp.annot   [  ]
##    |--+ qual   { Float32 59585 LZMA_ra(0.08%), 193B }
##    \--+ filter   { Str8 59585 LZMA_ra(0.07%), 205B }
class(genofile)
## [1] "SNPGDSFileClass" "gds.class"
# "SNPGDSFileClass" "gds.class"

pca <- snpgdsPCA(genofile, sample.id = NULL, snp.id = NULL, autosome.only = FALSE,
    remove.monosnp = FALSE, maf = NaN, missing.rate = NaN, eigen.cnt = 32,
    num.thread = 6, bayesian = FALSE, need.genmat = FALSE,
    genmat.only = FALSE, verbose = TRUE)
## Principal Component Analysis (PCA) on genotypes:
## Working space: 22 samples, 59,585 SNPs
##     using 6 (CPU) cores
## PCA:    the sum of all selected genotypes (0,1,2) = 0
## CPU capabilities: Double-Precision SSE2
## Wed Nov 14 03:57:17 2018    (internal increment: 34068)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed in 0s
## Wed Nov 14 03:57:17 2018    Begin (eigenvalues and eigenvectors)
## Wed Nov 14 03:57:17 2018    Done.
tab <- data.frame(sample.id = pca$sample.id, EV1 = pca$eigenvect[,1], EV2 = pca$eigenvect[,2],stringsAsFactors = FALSE)
head(tab) ## слишком мелкие значения
##        sample.id           EV1          EV2
## 1 SRR6480136_095  7.117494e-67 1.907609e-28
## 2 SRR6480137_095  5.888024e-38 1.907609e-28
## 3 SRR6480138_095 1.863011e-264 2.168969e-28
## 4 SRR6480139_095  4.203704e+06 9.716666e-67
## 5 SRR6480140_095  1.087300e-66 1.482646e-71
## 6 SRR6480141_095  9.251745e+39 2.420155e-52

Может быть нормализовать?

library(plotly)
 plot_ly(tab, x = ~log2(EV1), y = ~log2(EV2), text = ~sample.id, title = "log2" ) 
 plot_ly(tab, x = ~log10(EV1), y = ~log10(EV2), text = ~sample.id, title = "log10" ) 

Распределение SNP по позициям

CORR <- snpgdsPCACorr(pca, genofile, eig.which=1:4)
## SNP Correlation:
## Working space: 22 samples, 59585 SNPs
##     using 1 (CPU) core
##     using the top 4 eigenvectors
## Correlation:    the sum of all selected genotypes (0,1,2) = 0
## Wed Nov 14 03:57:19 2018    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed in 0s
## Wed Nov 14 03:57:19 2018    Done.
#plot(abs(CORR$snpcorr[3,]), y = NULL, xlab="SNP Index", ylab="PC 3")
#не поддерживается почему-то при knit
# close the gate
snpgdsClose(genofile)